{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import fasttext"
   ]
  },
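  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference only: training call, left commented out so the notebook does not retrain\n",
    "# (presumably how model.bin was produced -- confirm).\n",
    "# model = fasttext.train_supervised('shuf-train-fasttext.txt', dim=16, minn=2, loss='hs')"
   ]
  },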
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reference only: left commented out so an existing model.bin is not overwritten.\n",
    "# model.save_model(\"model.bin\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "# Load the previously saved fastText model from the working directory.\n",
    "model = fasttext.load_model('model.bin')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Share of the vocabulary size used as the `cutoff` later passed to model.quantize().\n",
    "# Run this before quantizing: get_words() is read from the model as it is right now.\n",
    "CUTOFF_RATIO = 0.2\n",
    "cutoff = int(len(model.get_words()) * CUTOFF_RATIO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 44 µs, sys: 12 µs, total: 56 µs\n",
      "Wall time: 57 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__local-malay',\n",
       "  '__label__local-english',\n",
       "  '__label__socialmedia-indonesian'),\n",
       " array([9.95590746e-01, 4.33338946e-03, 1.28890577e-04]))"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Time a lookup of up to three labels for a short two-word input.\n",
    "model.predict(\"bodo siak\", k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 20 µs, sys: 5 µs, total: 25 µs\n",
      "Wall time: 26 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__socialmedia-indonesian', '__label__local-malay'),\n",
       " array([9.99221981e-01, 8.17990163e-04]))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Time a lookup of up to three labels for a full-sentence input.\n",
    "model.predict(\"rada ketuaan kalo umur masih pas lebih bagus lg aishwarya\", k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 17 µs, sys: 4 µs, total: 21 µs\n",
      "Wall time: 21.9 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__local-malay',\n",
       "  '__label__standard-malay',\n",
       "  '__label__socialmedia-indonesian'),\n",
       " array([9.99312460e-01, 6.87691150e-04, 2.68310596e-05]))"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Time a lookup of up to three labels for another short input.\n",
    "model.predict(\"tak sedap mane pun\", k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 0 ns, sys: 20 µs, total: 20 µs\n",
      "Wall time: 21 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__socialmedia-indonesian',\n",
       "  '__label__local-malay',\n",
       "  '__label__standard-indonesian'),\n",
       " array([0.56794089, 0.40412852, 0.01983567]))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Baseline for this query on the loaded model; the same query is repeated after quantization.\n",
    "model.predict(\"nggak ada\", k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"rada ketuaan kalo umur masih pas lebih bagus lg aishwarya\"\r\n",
      "\"ada kursi gaa soalnya nunggu kamu pegel kalo sambil berdiri\"\r\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first two lines of the file.\n",
    "!head -n 2 filter-twitter-malay-rojak-id.jsonl.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"\\u60f3\\u3002\\u5e02\\u6c11\\u90ed\\u5148\\u751f\\u8ba4\\u4e3a\\uff0c\\u6e05\\u660e\\u5c0f\\u957f\\u5047\\u3001\\u201c\\u4e94\\u4e00\\u201d\\u5c0f\\u957f\\u5047\\u90fd\\u53ea\\u6709\\u4e09\\u5929\\uff0c\\u5f88\\u591a\\u5e02\\u6c11\\u53ea\\u80fd\\u53bb\\u5e02\\u533a\\u3001\\u90ca\\u533a\\u9876\\u591a\\u5317\\u4eac\\u5468\\u8fb9\\u6e38\\uff0c\\u5982\\u679c\\u80fd\\u591f\\u589e\\u52a0\\u5047\\u671f\\uff0c\\u90e8\\u5206\\u5e02\\u6c11\\u53ef\\u4ee5\\u8fdb\\u884c\\u957f\\u9014\\u65c5\\u884c\\uff0c\\u6216\\u8bb8\\u53ef\\u4ee5\\u8d77\\u5230\\u7f13\\u89e3\\u6e38\\u5ba2\\u538b\\u529b\\u7684\\u76ee\\u7684\\u3002\\u8457\\u540d\\u65c5\\u6e38\\u4e13\\u5bb6\\u5218\\u601d\\u654f\\u5728\\u63a5\\u53d7\\u5317\\u4eac\\u6668\\u62a5\\u91c7\\u8bbf\\u65f6\\u591a\\u6b21\\u6307\\u51fa\\uff0c\\u76ee\\u524d\\u4ecd\\u5b58\\u5728\\u957f\\u5047\\u77ed\\u7f3a\\u7684\\u95ee\\u9898\\uff0c\\u5e94\\u8be5\\u6062\\u590d\\u201c\\u4e94\\u4e00\\u201d\\u9ec4\\u91d1\\u5468\\uff0c\\u8fd9\\u662f\\u6e38\\u5ba2\\u7684\\u521a\\u6027\\u9700\\u6c42\\u3002\"\r\n",
      "\"\\u3010cctv \\u300a\\u671d\\u95fb\\u5929\\u4e0b\\u300b\\u3011\\u674e\\u514b\\u5f3a\\u4f1a\\u89c1\\u4e16\\u754c\\u7ecf\\u6d4e\\u8bba\\u575b\\u4e3b\\u5e2d\\u65bd\\u74e6\\u5e03\\u3002\"\r\n"
     ]
    }
   ],
   "source": [
    "# Peek at the first two lines of the file (non-ASCII text is stored as \\\\u escapes).\n",
    "!head -n 2 prepare-standard-mandarin.jsonl.dedup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quantize() compresses `model` in place, so re-running this cell would try to\n",
    "# quantize an already-quantized model; skip the call in that case so the cell\n",
    "# is safe to execute more than once.\n",
    "if not model.is_quantized():\n",
    "    model.quantize(qnorm=True, cutoff=cutoff)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the current (quantized) model to disk under the .ftz name.\n",
    "model.save_model(\"model.ftz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 64 µs, sys: 9 µs, total: 73 µs\n",
      "Wall time: 75.1 µs\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(('__label__socialmedia-indonesian',\n",
       "  '__label__local-malay',\n",
       "  '__label__standard-indonesian'),\n",
       " array([0.42140445, 0.39739412, 0.08959632]))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Same query as before quantization, to compare probabilities and timing.\n",
    "model.predict(\"nggak ada\", k=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
